# This file defines the Arrow schema for the Newswhip data.
# Passing an explicit schema lets us avoid some nasty type errors
# down the line.
import pyarrow as pa

# Explicit Arrow schema for the Newswhip data. Entries are given as
# (column name, Arrow type) pairs in column order. Dotted names such as
# "fb_data.likes" are literal column names, not nested struct fields.
NEWSWHIP_SCHEMA = pa.schema([
  # Top-level story columns.
  ("delta_time", pa.int64()),
  ("recent_fb_counts", pa.int64()),
  ("relatedStories", pa.list_(pa.string())),
  ("predicted_interactions", pa.float64()),
  ("predicted_timestamp", pa.int64()),
  ("uuid", pa.string()),
  ("publication_timestamp", pa.int64()),
  ("link", pa.string()),
  ("headline", pa.string()),
  ("excerpt", pa.string()),
  ("keywords", pa.string()),
  ("image_link", pa.string()),
  ("has_video", pa.bool_()),
  ("nw_score", pa.float64()),
  ("max_nw_score", pa.float64()),
  # Each element of "topics" is a struct with an integer id and a string name.
  (
    "topics",
    pa.list_(
      pa.struct([
        pa.field("id", pa.int64()),
        pa.field("name", pa.string()),
      ])
    ),
  ),
  # NOTE: "authors" is declared as a plain string. A list-of-strings
  # variant, pa.list_(pa.string()), was tried here and is left disabled.
  ("authors", pa.string()),
  ("entities", pa.list_(pa.string())),
  ("article", pa.string()),
  # fb_data.* columns.
  ("fb_data.total_engagement_count", pa.int64()),
  ("fb_data.likes", pa.int64()),
  ("fb_data.comments", pa.int64()),
  ("fb_data.shares", pa.int64()),
  ("fb_data.total_count_delta", pa.int64()),
  ("fb_data.delta_period", pa.int64()),
  ("fb_data.delta_period_unit", pa.string()),
  # tw_data.* columns.
  ("tw_data.tw_count", pa.int64()),
  ("tw_data.total_count_delta", pa.int64()),
  ("tw_data.delta_period", pa.int64()),
  ("tw_data.delta_period_unit", pa.string()),
  # li_data.* columns.
  ("li_data.li_count", pa.int64()),
  ("li_data.total_count_delta", pa.int64()),
  ("li_data.delta_period", pa.int64()),
  ("li_data.delta_period_unit", pa.string()),
  # pi_data.* columns (no total_count_delta column is declared for pi_data).
  ("pi_data.pi_count", pa.int64()),
  ("pi_data.delta_period", pa.int64()),
  ("pi_data.delta_period_unit", pa.string()),
  # source.* columns.
  ("source.publisher", pa.string()),
  ("source.domain", pa.string()),
  ("source.link", pa.string()),
  ("source.country", pa.string()),
  ("source.country_code", pa.string()),
  ("source.language", pa.string()),
  # NOTE: an "article.args" column typed pa.list_(pa.string()) is
  # intentionally left out of the schema (previously disabled).
  # Trailing date columns.
  ("creation_date", pa.timestamp("ns")),
  ("year", pa.int64()),
  ("month", pa.int64()),
])
